#matplotlib inline
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
# Standard packages
import json
# Lib to create maps
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
# Reading GeoJSON with Taiwan administrative borders
with open('Documents/county.geojson', encoding="utf-8") as file:
district_borders = json.loads(file.read())
# To display stuff in notebook
from IPython.display import display, Markdown
#Read csv file
df = pd.read_csv('Documents/Taiwan_Air_Quality_Data_by_Hours.csv', low_memory=False)
df.head()
| SiteName | County | AQI | Pollutant | Status | SO2 | CO | O3 | O3_8hr | PM10 | ... | WindDirec | DataCreationDate | Unit | CO_8hr | PM2.5_AVG | PM10_AVG | SO2_AVG | Longitude | Latitude | SiteId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 基隆 | 基隆市 | 30.0 | NaN | 良好 | 1.1 | 0.22 | 37 | 35.0 | 14.0 | ... | 79.0 | 2016-11-25 13:00 | NaN | 0.28 | 3.0 | 9 | NaN | NaN | NaN | NaN |
| 1 | 汐止 | 新北市 | 23.0 | NaN | 良好 | 0.7 | 0.26 | 28 | 27.0 | 21.0 | ... | 41.0 | 2016-11-25 13:00 | NaN | 0.31 | 6.0 | 11 | NaN | NaN | NaN | NaN |
| 2 | 萬里 | 新北市 | 34.0 | NaN | 良好 | 1.2 | 0.2 | 42 | 40.0 | 31.0 | ... | 65.0 | 2016-11-25 13:00 | NaN | 0.2 | 3.0 | 36 | NaN | NaN | NaN | NaN |
| 3 | 新店 | 新北市 | 29.0 | NaN | 良好 | 1.3 | 0.38 | 31 | 31.0 | 11.0 | ... | 99.0 | 2016-11-25 13:00 | NaN | 0.36 | 9.0 | 6 | NaN | NaN | NaN | NaN |
| 4 | 土城 | 新北市 | 25.0 | NaN | 良好 | 1.6 | 0.38 | 30 | 29.0 | 28.0 | ... | 154.0 | 2016-11-25 13:00 | NaN | 0.42 | 4.0 | 13 | NaN | NaN | NaN | NaN |
5 rows × 25 columns
df.tail()
| SiteName | County | AQI | Pollutant | Status | SO2 | CO | O3 | O3_8hr | PM10 | ... | WindDirec | DataCreationDate | Unit | CO_8hr | PM2.5_AVG | PM10_AVG | SO2_AVG | Longitude | Latitude | SiteId | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3517798 | 屏東(枋寮) | 屏東縣 | 126.0 | 細懸浮微粒 | 對敏感族群不健康 | 1.0 | 0.54 | 34.8 | 64 | 60 | ... | 54 | 2021-12-31 19:00 | NaN | 0.5 | 45.0 | 63 | 1.0 | 120.590369 | 22.384742 | 313.0 |
| 3517799 | 基隆 | 基隆市 | 37.0 | NaN | 良好 | 1.5 | 0.21 | 39.3 | 40 | 19 | ... | 86 | 2021-12-31 20:00 | NaN | 0.2 | 11.0 | 28 | 1.0 | 121.760056 | 25.129167 | 1.0 |
| 3517800 | 汐止 | 新北市 | 37.0 | NaN | 良好 | 2.0 | 0.26 | 29.5 | 34 | 17 | ... | 25 | 2021-12-31 20:00 | NaN | 0.2 | 12.0 | 26 | 1.0 | 121.642300 | 25.067131 | 2.0 |
| 3517801 | 萬里 | 新北市 | 44.0 | NaN | 良好 | 0.5 | 0.17 | 48.4 | 48 | 32 | ... | 55 | 2021-12-31 20:00 | NaN | 0.1 | 12.0 | 44 | 0.0 | 121.689881 | 25.179667 | 3.0 |
| 3517802 | 龍潭 | 桃園市 | 35.0 | NaN | 良好 | 1.3 | 0.21 | 36.6 | 38 | 11 | ... | 38 | 2021-12-31 20:00 | NaN | 0.2 | 10.0 | 20 | 1.0 | 121.216350 | 24.863869 | 21.0 |
5 rows × 25 columns
df.describe()
| AQI | SO2 | NO2 | NOx | NO | Unit | PM2.5_AVG | SO2_AVG | Longitude | Latitude | SiteId | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3.478820e+06 | 3.430662e+06 | 3.408523e+06 | 3.405580e+06 | 3.405192e+06 | 0.0 | 3.462344e+06 | 2.929376e+06 | 2.604989e+06 | 2.604989e+06 | 1.819081e+06 |
| mean | 5.691617e+01 | 2.392665e+00 | 1.204513e+01 | 1.585746e+01 | 3.811408e+00 | NaN | 1.777821e+01 | 2.194460e+00 | 1.208333e+02 | 2.406168e+01 | 5.373173e+01 |
| std | 3.114143e+01 | 2.090307e+00 | 9.039715e+00 | 1.584270e+01 | 9.118370e+00 | NaN | 1.212031e+01 | 1.258580e+00 | 6.139191e-01 | 9.864816e-01 | 5.851711e+01 |
| min | -1.000000e+00 | -9.990000e+02 | -2.778000e+01 | -1.600000e+00 | -7.200000e+00 | NaN | -1.000000e+00 | -1.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
| 25% | 3.300000e+01 | 1.400000e+00 | 5.600000e+00 | 6.900000e+00 | 8.000000e-01 | NaN | 9.000000e+00 | 1.000000e+00 | 1.203455e+02 | 2.311510e+01 | 2.100000e+01 |
| 50% | 4.900000e+01 | 2.000000e+00 | 9.800000e+00 | 1.180000e+01 | 1.500000e+00 | NaN | 1.500000e+01 | 2.000000e+00 | 1.207417e+02 | 2.415196e+01 | 4.200000e+01 |
| 75% | 7.400000e+01 | 2.900000e+00 | 1.600000e+01 | 1.900000e+01 | 3.000000e+00 | NaN | 2.400000e+01 | 3.000000e+00 | 1.214519e+02 | 2.501700e+01 | 6.400000e+01 |
| max | 5.000000e+02 | 1.800000e+02 | 3.510500e+02 | 4.310000e+02 | 3.913100e+02 | NaN | 2.450000e+02 | 6.000000e+01 | 1.217929e+02 | 2.616047e+01 | 3.130000e+02 |
print("Column datatypes: ")
Column datatypes:
print(df)
SiteName County AQI Pollutant Status SO2 CO O3 O3_8hr \
0 基隆 基隆市 30.0 NaN 良好 1.1 0.22 37 35.0
1 汐止 新北市 23.0 NaN 良好 0.7 0.26 28 27.0
2 萬里 新北市 34.0 NaN 良好 1.2 0.2 42 40.0
3 新店 新北市 29.0 NaN 良好 1.3 0.38 31 31.0
4 土城 新北市 25.0 NaN 良好 1.6 0.38 30 29.0
... ... ... ... ... ... ... ... ... ...
3517798 屏東(枋寮) 屏東縣 126.0 細懸浮微粒 對敏感族群不健康 1.0 0.54 34.8 64
3517799 基隆 基隆市 37.0 NaN 良好 1.5 0.21 39.3 40
3517800 汐止 新北市 37.0 NaN 良好 2.0 0.26 29.5 34
3517801 萬里 新北市 44.0 NaN 良好 0.5 0.17 48.4 48
3517802 龍潭 桃園市 35.0 NaN 良好 1.3 0.21 36.6 38
PM10 ... WindDirec DataCreationDate Unit CO_8hr PM2.5_AVG \
0 14.0 ... 79.0 2016-11-25 13:00 NaN 0.28 3.0
1 21.0 ... 41.0 2016-11-25 13:00 NaN 0.31 6.0
2 31.0 ... 65.0 2016-11-25 13:00 NaN 0.2 3.0
3 11.0 ... 99.0 2016-11-25 13:00 NaN 0.36 9.0
4 28.0 ... 154.0 2016-11-25 13:00 NaN 0.42 4.0
... ... ... ... ... ... ... ...
3517798 60 ... 54 2021-12-31 19:00 NaN 0.5 45.0
3517799 19 ... 86 2021-12-31 20:00 NaN 0.2 11.0
3517800 17 ... 25 2021-12-31 20:00 NaN 0.2 12.0
3517801 32 ... 55 2021-12-31 20:00 NaN 0.1 12.0
3517802 11 ... 38 2021-12-31 20:00 NaN 0.2 10.0
PM10_AVG SO2_AVG Longitude Latitude SiteId
0 9 NaN NaN NaN NaN
1 11 NaN NaN NaN NaN
2 36 NaN NaN NaN NaN
3 6 NaN NaN NaN NaN
4 13 NaN NaN NaN NaN
... ... ... ... ... ...
3517798 63 1.0 120.590369 22.384742 313.0
3517799 28 1.0 121.760056 25.129167 1.0
3517800 26 1.0 121.642300 25.067131 2.0
3517801 44 0.0 121.689881 25.179667 3.0
3517802 20 1.0 121.216350 24.863869 21.0
[3517803 rows x 25 columns]
# check the data type and find which column is not suitable for data analyzing
df.dtypes
SiteName object County object AQI float64 Pollutant object Status object SO2 float64 CO object O3 object O3_8hr object PM10 object PM2.5 object NO2 float64 NOx float64 NO float64 WindSpeed object WindDirec object DataCreationDate object Unit float64 CO_8hr object PM2.5_AVG float64 PM10_AVG object SO2_AVG float64 Longitude float64 Latitude float64 SiteId float64 dtype: object
# choose those columns that change the object into float
cols_to_float = ['CO', 'O3', 'O3_8hr', 'PM10', 'PM2.5', 'WindSpeed', 'WindDirec', 'CO_8hr', 'PM10_AVG']
# fill non-convertible values with NaN
df[cols_to_float] = df[cols_to_float].apply(pd.to_numeric, errors='coerce')
# change the object into float inorder to analyze
df[cols_to_float] = df[cols_to_float].astype(float)
# change the 'DataCreationDate' from object into DateTime
df['DataCreationDate'] = pd.to_datetime(df['DataCreationDate'])
# check again
df.dtypes
SiteName object County object AQI float64 Pollutant object Status object SO2 float64 CO float64 O3 float64 O3_8hr float64 PM10 float64 PM2.5 float64 NO2 float64 NOx float64 NO float64 WindSpeed float64 WindDirec float64 DataCreationDate datetime64[ns] Unit float64 CO_8hr float64 PM2.5_AVG float64 PM10_AVG float64 SO2_AVG float64 Longitude float64 Latitude float64 SiteId float64 dtype: object
# Detect missing values.
df.isna().any()
SiteName False County False AQI True Pollutant True Status True SO2 True CO True O3 True O3_8hr True PM10 True PM2.5 True NO2 True NOx True NO True WindSpeed True WindDirec True DataCreationDate False Unit True CO_8hr True PM2.5_AVG True PM10_AVG True SO2_AVG True Longitude True Latitude True SiteId True dtype: bool
# Find is it any null in column and calculate the total numbers of each column.
df.isnull().sum()
SiteName 0 County 0 AQI 38983 Pollutant 1824360 Status 5958 SO2 87141 CO 112289 O3 155157 O3_8hr 124667 PM10 101616 PM2.5 155929 NO2 109280 NOx 112223 NO 112611 WindSpeed 189650 WindDirec 189509 DataCreationDate 0 Unit 3517803 CO_8hr 83250 PM2.5_AVG 55459 PM10_AVG 71961 SO2_AVG 588427 Longitude 912814 Latitude 912814 SiteId 1698722 dtype: int64
df.isnull().mean()
SiteName 0.000000 County 0.000000 AQI 0.011082 Pollutant 0.518608 Status 0.001694 SO2 0.024771 CO 0.031920 O3 0.044106 O3_8hr 0.035439 PM10 0.028886 PM2.5 0.044326 NO2 0.031065 NOx 0.031901 NO 0.032012 WindSpeed 0.053911 WindDirec 0.053871 DataCreationDate 0.000000 Unit 1.000000 CO_8hr 0.023665 PM2.5_AVG 0.015765 PM10_AVG 0.020456 SO2_AVG 0.167271 Longitude 0.259484 Latitude 0.259484 SiteId 0.482893 dtype: float64
# choose those columns to interpolate
columns_to_interpolate = ['AQI', 'SO2', 'CO', 'O3', 'O3_8hr', 'PM10', 'PM2.5', 'NO2', 'NOx', 'NO', 'WindSpeed', 'WindDirec', 'CO_8hr', 'PM2.5_AVG', 'PM10_AVG', 'SO2_AVG']
# use interpolate to impute missing value
df_data = df[columns_to_interpolate].interpolate()
# check the numbers of NaN
print(df_data.isnull().sum())
AQI 0 SO2 0 CO 0 O3 0 O3_8hr 0 PM10 0 PM2.5 0 NO2 0 NOx 0 NO 0 WindSpeed 0 WindDirec 0 CO_8hr 0 PM2.5_AVG 0 PM10_AVG 0 SO2_AVG 537768 dtype: int64
The number of missing values for SO2_AVG has not changed, so perhaps we should consider using another method to impute the missing values, such as statistical values like the mean or median.
status_counts = df['Status'].value_counts()
# Set mandarin font
plt.rcParams['font.sans-serif'] = ['Taipei Sans TC Beta']
# Set figure size and title
plt.figure(figsize=(9,7))
plt.title('Distribution of status values', fontsize=16, fontweight='bold')
# Draw bar plot
sns.barplot(x = status_counts.values, y = status_counts.index)
# Indicate the number of each histogram
for i, v in enumerate(status_counts.values):
plt.text(v + 3, i + .15, str(v), color='blue', fontweight='bold')
# Set label
plt.xlabel('Count', fontsize=12)
plt.ylabel('Status', fontsize=12)
# Show bar plot
plt.show()
site_filter = df[['SiteName', 'County', 'Longitude', 'Latitude', 'SiteId']].copy()
# Remove missing values.
site_filter = site_filter.dropna()
# Return DataFrame with duplicate rows removed
site_filter = site_filter.drop_duplicates(inplace=False)
site_filter.head()
| SiteName | County | Longitude | Latitude | SiteId | |
|---|---|---|---|---|---|
| 1698658 | 基隆 | 基隆市 | 121.760056 | 25.129167 | 1.0 |
| 1698659 | 汐止 | 新北市 | 121.642300 | 25.067131 | 2.0 |
| 1698660 | 萬里 | 新北市 | 121.689881 | 25.179667 | 3.0 |
| 1698661 | 新店 | 新北市 | 121.537778 | 24.977222 | 4.0 |
| 1698726 | 復興 | 高雄市 | 120.312017 | 22.608711 | 71.0 |
#set map and the center is 臺灣地理中心碑
site_loc = folium.Map(location=[23.974046888716416, 120.97979604298477], tiles='cartodbpositron', zoom_start=8)
# Add points to the map
for idx, row in site_filter.iterrows():
Marker([row['Latitude'], row['Longitude']], popup=row['SiteName']).add_to(site_loc)
# Adding borders
folium.GeoJson(
district_borders,
name='geojson'
).add_to(site_loc)
# Display the map
site_loc